The link for the dataset https://www.kaggle.com/datasets/pavansubhasht/ibm-hr-analytics-attrition-dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pandas as pd
import plotly.express as px
# 2 lines below for html export
import plotly.io as pio
pio.renderers.default ='notebook'
# # 2 lines below for PDF export
# !pip install Pyppeteer
# !pyppeteer-install
# Setting for all rows and columns
pd.set_option('display.max_rows',2000)
pd.set_option("display.max_columns",500)
# Reading the dataset & view top 5 rows
hr_df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")
hr_df.head()
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | MonthlyRate | NumCompaniesWorked | Over18 | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Y | Yes | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | Y | No | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Y | Yes | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Y | Yes | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | Y | No | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
# Bottom 5 rows
hr_df.tail()
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | MonthlyRate | NumCompaniesWorked | Over18 | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1465 | 36 | No | Travel_Frequently | 884 | Research & Development | 23 | 2 | Medical | 1 | 2061 | 3 | Male | 41 | 4 | 2 | Laboratory Technician | 4 | Married | 2571 | 12290 | 4 | Y | No | 17 | 3 | 3 | 80 | 1 | 17 | 3 | 3 | 5 | 2 | 0 | 3 |
| 1466 | 39 | No | Travel_Rarely | 613 | Research & Development | 6 | 1 | Medical | 1 | 2062 | 4 | Male | 42 | 2 | 3 | Healthcare Representative | 1 | Married | 9991 | 21457 | 4 | Y | No | 15 | 3 | 1 | 80 | 1 | 9 | 5 | 3 | 7 | 7 | 1 | 7 |
| 1467 | 27 | No | Travel_Rarely | 155 | Research & Development | 4 | 3 | Life Sciences | 1 | 2064 | 2 | Male | 87 | 4 | 2 | Manufacturing Director | 2 | Married | 6142 | 5174 | 1 | Y | Yes | 20 | 4 | 2 | 80 | 1 | 6 | 0 | 3 | 6 | 2 | 0 | 3 |
| 1468 | 49 | No | Travel_Frequently | 1023 | Sales | 2 | 3 | Medical | 1 | 2065 | 4 | Male | 63 | 2 | 2 | Sales Executive | 2 | Married | 5390 | 13243 | 2 | Y | No | 14 | 3 | 4 | 80 | 0 | 17 | 3 | 2 | 9 | 6 | 0 | 8 |
| 1469 | 34 | No | Travel_Rarely | 628 | Research & Development | 8 | 3 | Medical | 1 | 2068 | 2 | Male | 82 | 4 | 2 | Laboratory Technician | 3 | Married | 4404 | 10228 | 2 | Y | No | 12 | 3 | 1 | 80 | 0 | 6 | 3 | 4 | 4 | 3 | 1 | 2 |
# Shape of the dataset
hr_df.shape
(1470, 35)
# Lets see the columns/features
hr_df.columns
Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
'YearsWithCurrManager'],
dtype='object')
# Information about the dataset(non_null values, datatype etc)
hr_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1470 entries, 0 to 1469 Data columns (total 35 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 1470 non-null int64 1 Attrition 1470 non-null object 2 BusinessTravel 1470 non-null object 3 DailyRate 1470 non-null int64 4 Department 1470 non-null object 5 DistanceFromHome 1470 non-null int64 6 Education 1470 non-null int64 7 EducationField 1470 non-null object 8 EmployeeCount 1470 non-null int64 9 EmployeeNumber 1470 non-null int64 10 EnvironmentSatisfaction 1470 non-null int64 11 Gender 1470 non-null object 12 HourlyRate 1470 non-null int64 13 JobInvolvement 1470 non-null int64 14 JobLevel 1470 non-null int64 15 JobRole 1470 non-null object 16 JobSatisfaction 1470 non-null int64 17 MaritalStatus 1470 non-null object 18 MonthlyIncome 1470 non-null int64 19 MonthlyRate 1470 non-null int64 20 NumCompaniesWorked 1470 non-null int64 21 Over18 1470 non-null object 22 OverTime 1470 non-null object 23 PercentSalaryHike 1470 non-null int64 24 PerformanceRating 1470 non-null int64 25 RelationshipSatisfaction 1470 non-null int64 26 StandardHours 1470 non-null int64 27 StockOptionLevel 1470 non-null int64 28 TotalWorkingYears 1470 non-null int64 29 TrainingTimesLastYear 1470 non-null int64 30 WorkLifeBalance 1470 non-null int64 31 YearsAtCompany 1470 non-null int64 32 YearsInCurrentRole 1470 non-null int64 33 YearsSinceLastPromotion 1470 non-null int64 34 YearsWithCurrManager 1470 non-null int64 dtypes: int64(26), object(9) memory usage: 402.1+ KB
# Values counts
hr_df['Attrition'].value_counts()
No 1233 Yes 237 Name: Attrition, dtype: int64
# Attrition rate
hr_df['Attrition'].value_counts(normalize=True)
No 0.838776 Yes 0.161224 Name: Attrition, dtype: float64
# Pie chart for attrition rate
attrition_df = hr_df.groupby('Attrition')['Age'].count().reset_index()
attrition_df.rename(columns={"Age":"Counts"}, inplace=True)
attrition_df
fig = px.pie(attrition_df, values="Counts", names="Attrition", title='Attrition Rate')
fig.show()
1. The attrition rate is higher and it is 16.1%
# 5 Point summary for the age
hr_df["Age"].describe()
count 1470.000000 mean 36.923810 std 9.135373 min 18.000000 25% 30.000000 50% 36.000000 75% 43.000000 max 60.000000 Name: Age, dtype: float64
# Outlier if any
fig = px.box(data_frame=hr_df,y="Age", title='Age Distribution',orientation='v')
fig.show()
1. The average age of employee is 36.
2. The minimum age is above 18 & maximum age is 60.
# Frequency for each category
hr_df['BusinessTravel'].value_counts()
Travel_Rarely 1043 Travel_Frequently 277 Non-Travel 150 Name: BusinessTravel, dtype: int64
# Countplot
businesstravel_df = hr_df['BusinessTravel'].value_counts().reset_index()
businesstravel_df.rename(columns={'index':"BusinessTravel",'BusinessTravel':"Counts"}, inplace=True)
fig = px.bar(data_frame=businesstravel_df,x='BusinessTravel',
y="Counts",
text="Counts",
title="BusinessTravel Distribution")
fig.show()
# Percentage distribution of categories
hr_df['BusinessTravel'].value_counts(normalize=True)
Travel_Rarely 0.709524 Travel_Frequently 0.188435 Non-Travel 0.102041 Name: BusinessTravel, dtype: float64
# 5 Point statistical summary
hr_df['DailyRate'].describe()
count 1470.000000 mean 802.485714 std 403.509100 min 102.000000 25% 465.000000 50% 802.000000 75% 1157.000000 max 1499.000000 Name: DailyRate, dtype: float64
# DailyRate distribution
fig = px.histogram(hr_df, x="DailyRate",
nbins=30,
title="DailyRate Distribution")
fig.show()
# Outlier Detection
fig = px.box(data_frame=hr_df,y="DailyRate",
title="DailyRate Distribution")
fig.show()
1. The average daily rate is 800$.
2. The 50% Employees daily rate is almost eqal to average daily rate.
# Frequency for department
hr_df['Department'].value_counts()
Research & Development 961 Sales 446 Human Resources 63 Name: Department, dtype: int64
# Pie chart for percentage distribution
dept_df = hr_df.groupby('Department')['Age'].count().reset_index().reset_index()
dept_df.rename(columns={"Age":"Counts"}, inplace=True)
fig = px.pie(data_frame=dept_df,
names=dept_df['Department'],
values=dept_df['Counts'],
title="Department Distribution")
fig.show()
# 5 Point summary
hr_df['DistanceFromHome'].describe()
count 1470.000000 mean 9.192517 std 8.106864 min 1.000000 25% 2.000000 50% 7.000000 75% 14.000000 max 29.000000 Name: DistanceFromHome, dtype: float64
# Ditribution
fig = px.histogram(data_frame=hr_df,x="DistanceFromHome", nbins=15, title="Distance From Home Distribution")
fig.show()
# Outliers if any
fig = px.box(data_frame=hr_df, y="DistanceFromHome")
fig.show()
1. Most employees prefer staying closer to office which is good option.
2. Average commute distance is 9km.
hr_df['Education'].value_counts()
3 572 4 398 2 282 1 170 5 48 Name: Education, dtype: int64
# Replacing the values for better interpretation
edu_dict = {1 :'Below College',2 :'College',3:'Bachelor',4: 'Master',5 :'Doctor'}
hr_df['Education'] = hr_df['Education'].replace(edu_dict)
hr_df['Education'].value_counts()
Bachelor 572 Master 398 College 282 Below College 170 Doctor 48 Name: Education, dtype: int64
edu_df = hr_df.groupby("Education")['Age'].count().reset_index()
edu_df.rename(columns={"Age":"Frequency"}, inplace=True)
fig = px.pie(data_frame=edu_df,values="Frequency",names='Education',title='Education Distribution')
fig.show()
edufld_df = hr_df.groupby("EducationField")['Age'].count().reset_index()
edufld_df.rename(columns={"Age":"Frequency"}, inplace=True)
fig = px.pie(data_frame=edufld_df,values="Frequency",names='EducationField',title='EducationField Distribution')
fig.show()
1. 41% employees have Life Science educational background.
2. Least 2% employees are from Human Resource as field of study.
env_dict = {1 :'Low',2 :'Medium',3: 'High',4: 'Very High'}
hr_df['EnvironmentSatisfaction'] = hr_df['EnvironmentSatisfaction'].replace(env_dict)
hr_df['EnvironmentSatisfaction'].value_counts()
High 453 Very High 446 Medium 287 Low 284 Name: EnvironmentSatisfaction, dtype: int64
env_df = hr_df.groupby("EnvironmentSatisfaction")['Age'].count().reset_index()
env_df.rename(columns={"Age":"Frequency"}, inplace=True)
fig = px.pie(data_frame=env_df,values="Frequency",names='EnvironmentSatisfaction',title='EnvironmentSatisfaction Distribution')
fig.show()
# gender distribution
hr_df['Gender'].value_counts()
Male 882 Female 588 Name: Gender, dtype: int64
# Pie chart for gender distribution
fig = px.pie(data_frame=hr_df,
names=hr_df['Gender'].value_counts().index,
values=hr_df['Gender'].value_counts().values,
title="Gender Distribution")
fig.show()
# HourlyRate distribution(histogram)
fig = px.histogram(data_frame=hr_df,
x=hr_df['HourlyRate'],
nbins=100,
title="Hours Rate Distribution")
fig.show()
1. The hours rate is uniformally distributed.
2. There are more employees with hourly rate in between 70 to 90.
# JobInvolvement Value counts
job_dict = {1 :'Low',2 :'Medium',3: 'High',4: 'Very High'}
hr_df['JobInvolvement'] = hr_df['JobInvolvement'].replace(job_dict)
hr_df['JobInvolvement'].value_counts(normalize=True)
High 0.590476 Medium 0.255102 Very High 0.097959 Low 0.056463 Name: JobInvolvement, dtype: float64
# JobInvolvement Distribution
fig = px.histogram(data_frame=hr_df,
x = 'JobInvolvement',
barmode = 'group',
title = 'JobInvolvement Distribution',
text_auto = 'JobInvolvement')
fig.show()
# Categorical count in JobRole
hr_df['JobRole'].value_counts()
Sales Executive 326 Research Scientist 292 Laboratory Technician 259 Manufacturing Director 145 Healthcare Representative 131 Manager 102 Sales Representative 83 Research Director 80 Human Resources 52 Name: JobRole, dtype: int64
# Pie chart to see the percent distribution in JobRole
fig = px.pie(data_frame=hr_df,
names= hr_df['JobRole'].value_counts().index,
values=hr_df['JobRole'].value_counts().values,
title='Job Role Distribution')
fig.show()
# MaritalStatus distribution
hr_df['MaritalStatus'].value_counts(normalize=True)
Married 0.457823 Single 0.319728 Divorced 0.222449 Name: MaritalStatus, dtype: float64
# Countplot
fig = px.histogram(data_frame=hr_df,
x="MaritalStatus",
barmode='group',
text_auto="MaritalStatus",
title="Marital Status Distribution")
fig.show()
# 5 point summary for MonthlyIncome
hr_df['MonthlyIncome'].describe()
count 1470.000000 mean 6502.931293 std 4707.956783 min 1009.000000 25% 2911.000000 50% 4919.000000 75% 8379.000000 max 19999.000000 Name: MonthlyIncome, dtype: float64
# boxplot
fig = px.box(data_frame=hr_df,
y='MonthlyIncome',
title='MonthlyIncome Distribution')
fig.show()
# Histogram
fig = px.histogram(data_frame=hr_df,
x='MonthlyIncome',
title="MonthlyIncome Distribution")
fig.show()
1. The average salary is 6500.
2. There are outliers above 16K salary.
3. The data is non-normally distributed and it is right skewed with median salary of 5K.
4. The max salary is nearly 20K while minimum salary is 1k.
1. High attrition is among the employees whose salary is below 5K.
hr_df.head()
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | MonthlyRate | NumCompaniesWorked | Over18 | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | Yes | Travel_Rarely | 1102 | Sales | 1 | College | Life Sciences | 1 | 1 | Medium | Female | 94 | High | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Y | Yes | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 1 | 49 | No | Travel_Frequently | 279 | Research & Development | 8 | Below College | Life Sciences | 1 | 2 | High | Male | 61 | Medium | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | Y | No | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 2 | 37 | Yes | Travel_Rarely | 1373 | Research & Development | 2 | College | Other | 1 | 4 | Very High | Male | 92 | Medium | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Y | Yes | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 3 | 33 | No | Travel_Frequently | 1392 | Research & Development | 3 | Master | Life Sciences | 1 | 5 | Very High | Female | 56 | High | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Y | Yes | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 4 | 27 | No | Travel_Rarely | 591 | Research & Development | 2 | Below College | Medical | 1 | 7 | Low | Male | 40 | High | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | Y | No | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
# NumCompaniesWorked
hr_df['NumCompaniesWorked'].value_counts()
1 521 0 197 3 159 2 146 4 139 7 74 6 70 5 63 9 52 8 49 Name: NumCompaniesWorked, dtype: int64
1. Fresher or one company switched employees are more and there is almost equal distribution above 5.
hr_df['Over18'].unique()
array(['Y'], dtype=object)
# As we have single value in over 18, we can drop this variable.
hr_df.drop("Over18", axis=1, inplace=True)
# Statistical summary
hr_df['PercentSalaryHike'].describe()
count 1470.000000 mean 15.209524 std 3.659938 min 11.000000 25% 12.000000 50% 14.000000 75% 18.000000 max 25.000000 Name: PercentSalaryHike, dtype: float64
# PercentSalaryHike Distribution
fig = px.histogram(data_frame=hr_df,
x="PercentSalaryHike")
fig.show()
1. The average salary hike is 15%.
2. Minimum salary hike is 11% and maximum salary 25%.
3. The data is not normally distributed and it is left skewed which means lot of employees salary hike is nearly below 15% and very few employees are getting more than 22%.
1. High hike percentage employees are switched to this organization.
2. There is high attrition in below average salary hike of 15%.